# package
library(ape)
## Warning: package 'ape' was built under R version 3.5.2
library(clusterProfiler)
##
## clusterProfiler v3.10.1 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#bring in phase1 allele frequencies
phase1.all.freqs<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.all.freqs.csv")
#make column for id
phase1.all.freqs$id<-paste(phase1.all.freqs$chrom,phase1.all.freqs$POS)
#bring in dataframe with locality and environmental data for each individual phase1
phase1.alldata<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.allvariables.csv")
#subset to only the variables we need (lat,long,hum,temp,precip)
meta<-phase1.alldata[,c(16,17,18,20,22)]
#subset only the unique lat/longs
metapops<-unique(meta)
#sort by latitude to match the order of the allele frequency files
sort.pops.phase1 <-metapops[order(metapops$latitude),]
## read in lfmm outlier table
lfmm.outlier.table <- read.csv("~/Downloads/vetted.lfmm.prec.env.csv")[,2:3]
#add separate chrom and pos columns
lfmm.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,1]
lfmm.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,2]
## read in bayescan outlier table
bayescan.outlier.table <- read.csv("~/Downloads/vetted.bayescenv.prec.env.csv")[,2:3]
#add separate chrom and pos columns
bayescan.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,1]
bayescan.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,2]
## read in genomic regions and attributes file
genomic_location <- read.gff("~/Downloads/Anopheles_gambiae.AgamP4.47.chr.gff3.gz")
#keep only regions identified as "gene"
gen_regions <- genomic_location[genomic_location$type == "gene", ] # subset to only gene regions
#pull all genes queried
all.gene.id<-data.frame(do.call('rbind', strsplit(as.character(gen_regions$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP004677", "biotype=protein_coding",
## "description=methylenetetrahydrofolate dehydrogenase(NAD+) / 5%2C10-
## methenyltetrahydrofolate [Source:VB Community Annotation]", : number of columns
## of result is not a multiple of vector length (arg 1)
all.genes<-data.frame(do.call('rbind', strsplit(as.character(all.gene.id),':',fixed=TRUE)))[,2]
#write.table(all.genes, "~/Downloads/all.genes.anopheles.gambiae.txt", row.names = F, col.names = F, quote = F)
#define chroms
chroms <- c("2R", "2L", "3R", "3L","X") # chromosomes of interest
#initialize gene.list for lfmm
lfmm.gene.list<-data.frame()
#fill gene.list by matching location of genes of interest in gene feature regions by chromosomes
for (i in 1:length(chroms)){
genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
pos <- as.numeric(as.character(lfmm.outlier.table[lfmm.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
for (w in 1:length(pos)) {
if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
lfmm.gene.list<-rbind(lfmm.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
}
}
cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
##
## chromosome 2 of 5 finished
##
## chromosome 3 of 5 finished
##
## chromosome 4 of 5 finished
##
## chromosome 5 of 5 finished
#pull out gene ID as its own column
lfmm.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP001824", "biotype=protein_coding",
## "gene_id=AGAP001824", : number of columns of result is not a multiple of vector
## length (arg 1)
lfmm.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$gene.id),':',fixed=TRUE)))[,2]
#make SNP ID column
lfmm.gene.list$SNPid<-paste(lfmm.gene.list$seqid,lfmm.gene.list$`pos[w]`)
#subset
lfmm.gene.list<-lfmm.gene.list[,c("SNPid","gene")]
#number of candidate snps
nrow(lfmm.outlier.table)
## [1] 5594
#calculate percentage of lfmm candidate SNPs that were in a gene
nrow(lfmm.gene.list)/nrow(lfmm.outlier.table)
## [1] 0.5861637
#calc % of the genome covered by the genes we are searching
sum(gen_regions$end-gen_regions$start)/278268413
## [1] 0.3935553
table(lfmm.gene.list$gene)[table(lfmm.gene.list$gene) >10]
##
## AGAP000519 AGAP000801 AGAP001022 AGAP001023 AGAP001027 AGAP001029 AGAP001035
## 36 18 53 11 26 28 25
## AGAP001038 AGAP001039 AGAP001043 AGAP001046 AGAP001047 AGAP001048 AGAP001052
## 104 32 117 36 16 139 203
## AGAP001053 AGAP001061 AGAP001064 AGAP001065 AGAP001068 AGAP001069 AGAP001070
## 69 330 29 18 32 62 146
## AGAP001073 AGAP001076 AGAP001082 AGAP001083 AGAP001084 AGAP001090 AGAP001091
## 223 43 55 14 72 52 84
## AGAP001093 AGAP001094 AGAP004707 AGAP007731 AGAP007732 AGAP007736 AGAP007761
## 80 36 11 15 35 118 17
## AGAP010293 AGAP010295 AGAP010310 AGAP010311 AGAP010312 AGAP010313 AGAP010314
## 15 51 25 21 39 20 34
## AGAP013021 AGAP013341 AGAP029563
## 163 19 30
hist(table(lfmm.gene.list$gene))
#write.table(unique(lfmm.gene.list$gene), "~/Downloads/lfmm.prec.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(lfmm.gene.list, "~/Downloads/lfmm.prec.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#initialize gene.list for bayescan
bayescan.gene.list<-data.frame()
#fill gene.list
for (i in 1:length(chroms)){
genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
pos <- as.numeric(as.character(bayescan.outlier.table[bayescan.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
for (w in 1:length(pos)) {
if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
bayescan.gene.list<-rbind(bayescan.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
}
}
cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
##
## chromosome 2 of 5 finished
##
## chromosome 3 of 5 finished
##
## chromosome 4 of 5 finished
##
## chromosome 5 of 5 finished
#pull out gene ID as its own column
bayescan.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP001824", "biotype=protein_coding",
## "gene_id=AGAP001824", : number of columns of result is not a multiple of vector
## length (arg 1)
bayescan.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$gene.id),':',fixed=TRUE)))[,2]
#make SNP ID column
bayescan.gene.list$SNPid<-paste(bayescan.gene.list$seqid,bayescan.gene.list$`pos[w]`)
#subset
bayescan.gene.list<-bayescan.gene.list[,c("SNPid","gene")]
#number of candidate snps
nrow(bayescan.outlier.table)
## [1] 1261
#calculate percentage of bayescan candidate SNPs that were in a gene
nrow(bayescan.gene.list)/nrow(bayescan.outlier.table)
## [1] 0.5099128
#calc % of the genome covered by the genes we are searching
sum(gen_regions$end-gen_regions$start)/278268413
## [1] 0.3935553
table(bayescan.gene.list$gene)[table(bayescan.gene.list$gene) >10]
##
## AGAP000893 AGAP000915 AGAP000940 AGAP000961 AGAP000962 AGAP000969 AGAP000974
## 19 19 15 22 48 11 13
## AGAP000984 AGAP000998 AGAP001015 AGAP001023 AGAP001052 AGAP001094 AGAP001674
## 37 16 24 12 11 12 11
## AGAP001824
## 47
hist(table(bayescan.gene.list$gene))
#write.table(unique(bayescan.gene.list$gene), "~/Downloads/bayescan.prec.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(bayescan.gene.list, "~/Downloads/bayescan.prec.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#look into how many genes overlap between the methods
intersect(unique(lfmm.gene.list$gene),unique(bayescan.gene.list$gene))
## [1] "AGAP001824" "AGAP029626" "AGAP002916" "AGAP029619" "AGAP001674"
## [6] "AGAP004707" "AGAP006347" "AGAP006590" "AGAP029578" "AGAP007736"
## [11] "AGAP007754" "AGAP007761" "AGAP007762" "AGAP007763" "AGAP010816"
## [16] "AGAP010817" "AGAP000844" "AGAP000847" "AGAP029672" "AGAP000962"
## [21] "AGAP000974" "AGAP001022" "AGAP001023" "AGAP001025" "AGAP001029"
## [26] "AGAP001031" "AGAP001034" "AGAP001035" "AGAP001036" "AGAP001037"
## [31] "AGAP001038" "AGAP001039" "AGAP001040" "AGAP001043" "AGAP001046"
## [36] "AGAP001048" "AGAP001052" "AGAP001053" "AGAP013021" "AGAP001061"
## [41] "AGAP001064" "AGAP001065" "AGAP001069" "AGAP001070" "AGAP001073"
## [46] "AGAP001076" "AGAP001082" "AGAP001083" "AGAP001084" "AGAP001091"
## [51] "AGAP001092" "AGAP001093" "AGAP001094"
LFMM overrep testing
#overrepresentation testing
#read in each GO term to gene map (col1=geneID,col2=GOterm)
mart.export<-read.table("~/Downloads/mart_export (3).txt", sep = "\t", header = T)
#calculate overrepresentation for lfmm prec
lfmm.prec.enriched<-enricher(gene = lfmm.gene.list$gene,
universe = all.genes,
pAdjustMethod = "fdr",
TERM2GENE= mart.export[,c(2,1)]
)
result<-lfmm.prec.enriched@result
lfmm.prec.enriched.sig<-result[result$p.adjust <= .05,]
lfmm.prec.enriched.sig
#there are no significantly enriched GO terms
result[1:5,]
#write.table(lfmm.prec.enriched.sig[,c("ID","geneID")], file="~/Downloads/lfmm.prec.go.overrep.txt",
# row.names = F, col.names = F, quote = F)
Bayescenv overrep testing
#calculate overrepresentation for bayescan prec
bayescan.prec.enriched<-enricher(gene = bayescan.gene.list$gene,
universe = all.genes,
pAdjustMethod = "fdr",
TERM2GENE= mart.export[,c(2,1)]
)
result<-bayescan.prec.enriched@result
#retain significantly enriched GO terms
bayescan.prec.enriched.sig<-result[result$p.adjust <= .05,]
#check out significantly overrepresented GO terms
bayescan.prec.enriched.sig
#get a vector of genes we found SNPs in for each enriched GO term
#bayescan
go.0042391<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0042391"]),'/'))
go.0007015<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0007015"]),'/'))
go.0050877<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0050877"]),'/'))
go.0045202<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0045202"]),'/'))
go.0006811<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0006811"]),'/'))
go.0038023<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0038023"]),'/'))
go.0045211<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0045211"]),'/'))
go.0005216<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0005216"]),'/'))
go.0007268<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0007268"]),'/'))
go.0036459<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0036459"]),'/'))
go.0043005<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0043005"]),'/'))
bayescan.gene.list.go.0042391<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0042391,]
#plot all SNPs associated w/ GO:0042391
sig.pos<-bayescan.gene.list.go.0042391$SNPid
gene<-droplevels(bayescan.gene.list.go.0042391$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0007015<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0007015,]
#plot all SNPs associated w/ GO:0007015
sig.pos<-bayescan.gene.list.go.0007015$SNPid
gene<-droplevels(bayescan.gene.list.go.0007015$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0050877<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0050877,]
#plot all SNPs associated w/ GO:0050877
sig.pos<-bayescan.gene.list.go.0050877$SNPid
gene<-droplevels(bayescan.gene.list.go.0050877$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0045202<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0045202,]
#plot all SNPs associated w/ GO:0045202
sig.pos<-bayescan.gene.list.go.0045202$SNPid
gene<-droplevels(bayescan.gene.list.go.0045202$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0006811<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0006811,]
#plot all SNPs associated w/ GO:0006811
sig.pos<-bayescan.gene.list.go.0006811$SNPid
gene<-droplevels(bayescan.gene.list.go.0006811$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0038023<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0038023,]
#plot all SNPs associated w/ GO:0038023
sig.pos<-bayescan.gene.list.go.0038023$SNPid
gene<-droplevels(bayescan.gene.list.go.0038023$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0045211<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0045211,]
#plot all SNPs associated w/ GO:0045211
sig.pos<-bayescan.gene.list.go.0045211$SNPid
gene<-droplevels(bayescan.gene.list.go.0045211$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0005216<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0005216,]
#plot all SNPs associated w/ GO:0005216
sig.pos<-bayescan.gene.list.go.0005216$SNPid
gene<-droplevels(bayescan.gene.list.go.0005216$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0007268<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0007268,]
#plot all SNPs associated w/ GO:0007268
sig.pos<-bayescan.gene.list.go.0007268$SNPid
gene<-droplevels(bayescan.gene.list.go.0007268$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0036459<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0036459,]
#plot all SNPs associated w/ GO:0036459
sig.pos<-bayescan.gene.list.go.0036459$SNPid
gene<-droplevels(bayescan.gene.list.go.0036459$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}
bayescan.gene.list.go.0043005<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0043005,]
#plot all SNPs associated w/ GO:0043005
sig.pos<-bayescan.gene.list.go.0043005$SNPid
gene<-droplevels(bayescan.gene.list.go.0043005$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
abline(lm(frq~sort.pops.phase1$hannual))
}